* Session setup: turn off output paging and start from an empty dataset in memory.
set more off
clear
* From this point on every command, and every star comment, must end with a semicolon.
#delimit ;
/*****************************************************************************************************************;
**This dofile starts from expandedDHSGPSYYYY and generates collapsedGPSYYYY,
**where YYYY is the DHS survey year (2004, 2007 and 1999);

Other datasets this dofile also generates, for the simulated arsenic measures:
hhXwelldata_uncontaminated_YYYY
hhXwelldata_contaminated_YYYY

Other pre-generated datasets this dofile also uses:
BGS Survey Cleaned.csv
*****************************************************************************************************************/;

*****************************************************************************************************************;
*Paths;
*****************************************************************************************************************;

*Output stems -- a survey-year suffix and the .dta extension are appended when each file is saved;
local outdataset "dtafiles/collapsedGPS";
local simul_dataset "dtafiles/hhXwelldata_uncontaminated";
local simul_datasetC "dtafiles/hhXwelldata_contaminated";

*Input stem -- read from the subdirectory filled by the data creating dofiles, one file per survey year;
local indataset "dtafiles/expandedDHSGPS";

*****************************************************************************************************************;
*Main;
*****************************************************************************************************************;

*install necessary packages/adofiles;
*The user-written vincenty command is fetched from SSC only when -which- cannot locate it,;
*so the dofile no longer needs network access once the package is installed;
capture which vincenty;
if _rc!=0 {;
	ssc install vincenty, replace;
};

**Import and clean well data**;
*Read the cleaned BGS well survey -- the first line of the csv supplies the variable names;
insheet using "rawdta/BGS Survey/BGS Survey Cleaned.csv", names;

*The first data row holds extra text for each column (presumably units -- confirm);
*Append that text to the variable label unless it is just a dot, then discard the row;
foreach col of varlist sample_id-zn {;
	local firstrow=`col'[1];
	di "`firstrow'";
	if "`firstrow'"!="." {;
		label variable `col' "`col' `firstrow'";
	};
};
drop if _n==1;

label variable sample_id "Well Id number";

**Generates a sequential id number for wells**;
gen wellcount=_n;
sort wellcount;

*Park the labelled well list in a temporary file so it can be merged onto each survey year;
tempfile BGS_labelled;
save `BGS_labelled', replace;

*For each DHS survey year: pair every DHS id with every well, compute distances,;
*save the household-by-well files used for the simulated arsenic measures,;
*and collapse to one observation per DHS id;
foreach YYYY in 2004 2007 1999 {;

**For each DHS id number, assigns wellcount values 1,2,3,... within the id so each DHS id is matched with each wellcount value;
**(the original note expects 3534 rows per DHS id, one per well -- confirm against the input file);
use "`indataset'`YYYY'.dta", clear;
bysort dhsid: gen wellcount=_n;
sort wellcount;

**Merges on wellcount so every well is assigned to each DHS id;
*NOTE(review): unmatched rows from either side are kept and _merge stays in the data -- confirm this is intended;
merge m:1 wellcount using `BGS_labelled';

*Arsenic is a string and it would be more useful as a number;
*Entries written as < 0.5 and < 6 are replaced by 0.5 and 6, any other non-numeric entry stays missing;
gen arsenic=real(as);
replace arsenic=0.5 if as=="< 0.5";
replace arsenic=6 if as=="< 6";

label variable arsenic "Arsenic contamination of well";
ren lat_deg latwell;
label variable latwell "latitude of well";
ren long_deg longwell;
label variable longwell "longitude of well";
ren latnum lathhid;
label variable lathhid "latitude of household";
ren longnum longhhid;
label variable longhhid "longitude of household";
label variable well_depth "depth of well";
label variable dhsid "id number of household";

**Calculates distance between DHS cluster and each well and finds the closest well to each DHS cluster**;
*h() stores the distance computed by vincenty in a new variable -- units are the vincenty default, TODO confirm;
vincenty latwell longwell lathhid longhhid, h(distance);
label variable distance "distance between well and household";

*Wells with arsenic below 50: blank the coordinates of all other wells so their distance is missing,;
*then take the smallest remaining distance within each DHS id;
gen uncontaminatedcont=arsenic if arsenic<50;
gen latwellu=latwell;
replace latwellu=. if uncontaminatedcont==.;
gen longwellu=longwell;
replace longwellu=. if uncontaminatedcont==.;
vincenty latwellu longwellu lathhid longhhid, h(distanceu);
egen mindistanceu=min(distanceu), by(dhsid);
label variable mindistanceu "minimum distance from cluster to uncontaminated well";

*Same construction for wells with arsenic of 50 or more;
*Missing values compare as larger than any number in Stata, so missing arsenic is excluded explicitly;
gen contaminatedcont=arsenic if arsenic>=50 & !missing(arsenic);
gen latwellc=latwell;
replace latwellc=. if contaminatedcont==.;
gen longwellc=longwell;
replace longwellc=. if contaminatedcont==.;
vincenty latwellc longwellc lathhid longhhid, h(distancec);
egen mindistancec=min(distancec), by(dhsid);
label variable mindistancec "minimum distance from cluster to contaminated well";

**HW: generates variables based on well data for each DHS cluster**;
*Arsenic reading of wells closer than 5 distance units, missing otherwise;
gen as_walkable5=arsenic if distance<5;
label variable as_walkable5 "Arsenic level of wells within walking distance to household";

*Fix: a well with missing arsenic used to satisfy arsenic>=50 and was flagged as contaminated,;
*which inflated numcontaminated5 and fractioncont5 and let such wells into the contaminated file;
gen contaminated=1 if arsenic>=50 & !missing(arsenic);
gen contaminated5=contaminated if distance<5;

gen uncontaminated=1 if arsenic<50;
gen uncontaminated5=uncontaminated if distance<5;

compress;

*Household-by-well file for uncontaminated wells;
*Keeps pairs closer than the largest per-id minimum distance plus 10.1;
preserve;

keep if uncontaminated==1;
sum mindistanceu;
keep if distance<r(max)+10.1;
keep dhsid wellcount lathhid longhhid latwell longwell distance arsenic;

saveold "`simul_dataset'_`YYYY'.dta", replace;
restore;

*Household-by-well file for contaminated wells, built the same way;
preserve;

keep if contaminated==1;
sum mindistancec;
keep if distance<r(max)+10.1;
keep dhsid wellcount lathhid longhhid latwell longwell distance arsenic;

saveold "`simul_datasetC'_`YYYY'.dta", replace;
restore;

**HW: collapses to one observation per DHS cluster**;
*Mean arsenic of nearby wells, plus counts of nearby wells with a reading and of nearby contaminated wells;
collapse (mean) as_walkable5
		(count) numwalkable5=as_walkable5 numcontaminated5=contaminated5  , by(dhsid);

*Share of nearby wells that are contaminated -- missing when there is no nearby well;
gen fractioncont5=(numcontaminated5/numwalkable5);
*Mean nearby arsenic divided by 1000;
gen as_walkable5_g=as_walkable5/1000;
drop as_walkable5;

*Numeric id taken from the last three characters of dhsid, named with the two-digit survey year;
local YY=substr("`YYYY'",-2,2);
gen dhsid`YY'=real(substr(dhsid,-3,3));

save "`outdataset'`YYYY'.dta", replace;

};



 
